{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Pong_Deep_Q_Learning_with_Fixed_targets.ipynb",
      "provenance": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/PacktPublishing/Modern-Computer-Vision-with-PyTorch/blob/master/Chapter16/Pong_Deep_Q_Learning_with_Fixed_targets.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "KeIaR6oerFRN"
      },
      "source": [
        "import gym\n",
        "import numpy as np\n",
        "import cv2\n",
        "from collections import deque\n",
        "import matplotlib.pyplot as plt\n",
        "import torch\n",
        "import torch.nn as nn\n",
        "import torch.nn.functional as F\n",
        "import numpy as np\n",
        "import random\n",
        "from collections import namedtuple, deque\n",
        "import torch\n",
        "import torch.nn.functional as F\n",
        "import torch.optim as optim\n",
        "import matplotlib.pyplot as plt\n",
        "%matplotlib inline\n",
        "\n",
        "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
        "\n",
        "env = gym.make('PongDeterministic-v0')\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "t44gFycfrIeT"
      },
      "source": [
        "state_size = env.observation_space.shape[0]\n",
        "action_size = env.action_space.n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "eewhAu_MrJ_P"
      },
      "source": [
        "def preprocess_frame(frame): \n",
        "    bkg_color = np.array([144, 72, 17])\n",
        "    img = np.mean(frame[34:-16:2,::2]-bkg_color, axis=-1)/255.\n",
        "    resized_image = img\n",
        "    return resized_image"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "F2JLUoEcrMY0"
      },
      "source": [
        "def stack_frames(stacked_frames, state, is_new_episode):\n",
        "    # Preprocess frame\n",
        "    frame = preprocess_frame(state)\n",
        "    stack_size = 4\n",
        "    if is_new_episode:\n",
        "        # Clear our stacked_frames\n",
        "        stacked_frames = deque([np.zeros((80,80), dtype=np.uint8) for i in range(stack_size)], maxlen=4)\n",
        "        # Because we're in a new episode, copy the same frame 4x\n",
        "        for i in range(stack_size):\n",
        "            stacked_frames.append(frame) \n",
        "        # Stack the frames\n",
        "        stacked_state = np.stack(stacked_frames, axis=2).transpose(2, 0, 1)\n",
        "    else:\n",
        "        # Append frame to deque, automatically removes the oldest frame\n",
        "        stacked_frames.append(frame)\n",
        "        # Build the stacked state (first dimension specifies different frames)\n",
        "        stacked_state = np.stack(stacked_frames, axis=2).transpose(2, 0, 1) \n",
        "    return stacked_state, stacked_frames\n",
        "class DQNetwork(nn.Module):\n",
        "    def __init__(self, states, action_size):\n",
        "        super(DQNetwork, self).__init__()\n",
        "        \n",
        "        self.conv1 = nn.Conv2d(4, 32, (8, 8), stride=4)\n",
        "        self.conv2 = nn.Conv2d(32, 64, (4, 4), stride=2)\n",
        "        self.conv3 = nn.Conv2d(64, 64, (3, 3), stride=1)\n",
        "        self.flatten = nn.Flatten()\n",
        "        self.fc1 = nn.Linear(2304, 512)\n",
        "        self.fc2 = nn.Linear(512, action_size)\n",
        "        \n",
        "    def forward(self, state): \n",
        "        x = F.relu(self.conv1(state))\n",
        "        x = F.relu(self.conv2(x))\n",
        "        x = F.relu(self.conv3(x))\n",
        "        x = self.flatten(x)\n",
        "        x = F.relu(self.fc1(x))\n",
        "        x = self.fc2(x)\n",
        "        return x"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "4rNQ6IthrUNB"
      },
      "source": [
        "class Agent():\n",
        "    def __init__(self, state_size, action_size):\n",
        "        \n",
        "        self.state_size = state_size\n",
        "        self.action_size = action_size\n",
        "        self.seed = random.seed(0)\n",
        "\n",
        "        ## hyperparameters\n",
        "        self.buffer_size = 10000\n",
        "        self.batch_size = 32\n",
        "        self.gamma = 0.99\n",
        "        self.lr = 0.0001\n",
        "        self.update_every = 4\n",
        "        self.update_every_target = 1000 \n",
        "        self.learn_every_target_counter = 0\n",
        "        # Q-Network\n",
        "        self.local = DQNetwork(state_size, action_size).to(device)\n",
        "        self.target = DQNetwork(state_size, action_size).to(device)\n",
        "        self.optimizer = optim.Adam(self.local.parameters(), lr=self.lr)\n",
        "\n",
        "        # Replay memory\n",
        "        self.memory = deque(maxlen=self.buffer_size) \n",
        "        self.experience = namedtuple(\"Experience\", field_names=[\"state\", \"action\", \"reward\", \"next_state\", \"done\"])\n",
        "        # Initialize time step (for updating every few steps)\n",
        "        self.t_step = 0\n",
        "    def step(self, state, action, reward, next_state, done):\n",
        "        # Save experience in replay memory\n",
        "        self.memory.append(self.experience(state[None], action, reward, next_state[None], done))\n",
        "        \n",
        "        # Learn every update_every time steps.\n",
        "        self.t_step = (self.t_step + 1) % self.update_every\n",
        "        if self.t_step == 0:\n",
        "   # If enough samples are available in memory, get random subset and learn\n",
        "            if len(self.memory) > self.batch_size:\n",
        "                experiences = self.sample_experiences()\n",
        "                self.learn(experiences, self.gamma)\n",
        "    def act(self, state, eps=0.):\n",
        "        # Epsilon-greedy action selection\n",
        "        if random.random() > eps:\n",
        "            state = torch.from_numpy(state).float().unsqueeze(0).to(device)\n",
        "            self.local.eval()\n",
        "            with torch.no_grad():\n",
        "                action_values = self.local(state)\n",
        "            self.local.train()\n",
        "            return np.argmax(action_values.cpu().data.numpy())\n",
        "        else:\n",
        "            return random.choice(np.arange(self.action_size))\n",
        "    def learn(self, experiences, gamma):\n",
        "        self.learn_every_target_counter+=1\n",
        "        states, actions, rewards, next_states, dones = experiences\n",
        "       # Get expected Q values from local model\n",
        "        Q_expected = self.local(states).gather(1, actions)\n",
        "\n",
        "        # Get max predicted Q values (for next states) from target model\n",
        "        Q_targets_next = self.target(next_states).detach().max(1)[0].unsqueeze(1)\n",
        "        # Compute Q targets for current state\n",
        "        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))\n",
        "        \n",
        "        # Compute loss\n",
        "        loss = F.mse_loss(Q_expected, Q_targets)\n",
        "\n",
        "        # Minimize the loss\n",
        "        self.optimizer.zero_grad()\n",
        "        loss.backward()\n",
        "        self.optimizer.step()\n",
        "\n",
        "        # ------------------- update target network ------------------- #\n",
        "        if self.learn_every_target_counter%1000 ==0:\n",
        "            self.target_update() \n",
        "    def target_update(self):\n",
        "        print('target updating')\n",
        "        self.target.load_state_dict(self.local.state_dict())\n",
        "    def sample_experiences(self):\n",
        "        experiences = random.sample(self.memory, k=self.batch_size)        \n",
        "        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)\n",
        "        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)\n",
        "        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)\n",
        "        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)\n",
        "        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)        \n",
        "        return (states, actions, rewards, next_states, dones)\n",
        "agent = Agent(state_size, action_size)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Y2ZVDl2yrrZX"
      },
      "source": [
        "n_episodes=5000\n",
        "max_t=5000\n",
        "eps_start=1.0\n",
        "eps_end=0.02\n",
        "eps_decay=0.995\n",
        "scores = [] # list containing scores from each episode\n",
        "scores_window = deque(maxlen=100) # last 100 scores\n",
        "eps = eps_start\n",
        "stack_size = 4\n",
        "stacked_frames = deque([np.zeros((80,80), dtype=np.int) for i in range(stack_size)], maxlen=stack_size) "
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "iACTglwrrs1L",
        "outputId": "1009afc5-f3fc-4c0e-b77f-8be3dca9253a",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "source": [
        "for i_episode in range(1, n_episodes+1):\n",
        "    state = env.reset()\n",
        "    state, frames = stack_frames(stacked_frames, state, True)\n",
        "    score = 0\n",
        "    for i in range(max_t):\n",
        "        action = agent.act(state, eps)\n",
        "        next_state, reward, done, _ = env.step(action)\n",
        "        next_state, frames = stack_frames(frames, next_state, False)\n",
        "        agent.step(state, action, reward, next_state, done)\n",
        "        state = next_state\n",
        "        score += reward\n",
        "        if done:\n",
        "            break \n",
        "    scores_window.append(score) # save most recent score\n",
        "    scores.append(score) # save most recent score\n",
        "    eps = max(eps_end, eps_decay*eps) # decrease epsilon\n",
        "    print('\\rEpisode {}\\tReward {} \\tAverage Score: {:.2f} \\tEpsilon: {}'.format(i_episode,score,np.mean(scores_window), eps), end=\"\")\n",
        "    if i_episode % 100 == 0:\n",
        "        print('\\rEpisode {}\\tAverage Score: {:.2f} \\tEpsilon: {}'.format(i_episode, np.mean(scores_window), eps))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Episode 4\tReward -20.0 \tAverage Score: -20.25 \tEpsilon: 0.9801495006250001target updating\n",
            "Episode 8\tReward -21.0 \tAverage Score: -20.62 \tEpsilon: 0.960693043575437target updating\n",
            "Episode 13\tReward -21.0 \tAverage Score: -20.31 \tEpsilon: 0.9369146928798039target updating\n",
            "Episode 17\tReward -21.0 \tAverage Score: -20.47 \tEpsilon: 0.918316468354365target updating\n",
            "Episode 22\tReward -18.0 \tAverage Score: -20.36 \tEpsilon: 0.8955869907338783target updating\n",
            "Episode 26\tReward -21.0 \tAverage Score: -20.46 \tEpsilon: 0.8778091417340573target updating\n",
            "Episode 30\tReward -20.0 \tAverage Score: -20.47 \tEpsilon: 0.8603841919146962target updating\n",
            "Episode 35\tReward -20.0 \tAverage Score: -20.49 \tEpsilon: 0.8390886103705794target updating\n",
            "Episode 39\tReward -20.0 \tAverage Score: -20.44 \tEpsilon: 0.8224322824348486target updating\n",
            "Episode 43\tReward -21.0 \tAverage Score: -20.42 \tEpsilon: 0.8061065909263957target updating\n",
            "Episode 48\tReward -21.0 \tAverage Score: -20.42 \tEpsilon: 0.7861544476842928target updating\n",
            "Episode 52\tReward -20.0 \tAverage Score: -20.38 \tEpsilon: 0.7705488893118823target updating\n",
            "Episode 56\tReward -19.0 \tAverage Score: -20.30 \tEpsilon: 0.7552531090661897target updating\n",
            "Episode 60\tReward -20.0 \tAverage Score: -20.30 \tEpsilon: 0.7402609576967045target updating\n",
            "Episode 64\tReward -20.0 \tAverage Score: -20.31 \tEpsilon: 0.7255664080186093target updating\n",
            "Episode 68\tReward -18.0 \tAverage Score: -20.26 \tEpsilon: 0.7111635524897149target updating\n",
            "Episode 72\tReward -19.0 \tAverage Score: -20.24 \tEpsilon: 0.697046600835495target updating\n",
            "Episode 76\tReward -20.0 \tAverage Score: -20.24 \tEpsilon: 0.6832098777212641target updating\n",
            "Episode 80\tReward -20.0 \tAverage Score: -20.26 \tEpsilon: 0.6696478204705644target updating\n",
            "Episode 83\tReward -20.0 \tAverage Score: -20.24 \tEpsilon: 0.6596532430440636target updating\n",
            "Episode 87\tReward -20.0 \tAverage Score: -20.22 \tEpsilon: 0.6465587967553006target updating\n",
            "Episode 91\tReward -20.0 \tAverage Score: -20.21 \tEpsilon: 0.6337242817644086target updating\n",
            "Episode 95\tReward -21.0 \tAverage Score: -20.18 \tEpsilon: 0.6211445383053219target updating\n",
            "Episode 98\tReward -21.0 \tAverage Score: -20.16 \tEpsilon: 0.6118738784280476target updating\n",
            "Episode 100\tAverage Score: -20.16 \tEpsilon: 0.6057704364907278\n",
            "Episode 102\tReward -20.0 \tAverage Score: -20.15 \tEpsilon: 0.5997278763867329target updating\n",
            "Episode 106\tReward -20.0 \tAverage Score: -20.11 \tEpsilon: 0.5878229785513479target updating\n",
            "Episode 109\tReward -21.0 \tAverage Score: -20.06 \tEpsilon: 0.5790496471185967target updating\n",
            "Episode 111\tReward -16.0 \tAverage Score: -19.99 \tEpsilon: 0.5732736268885887target updating\n",
            "Episode 115\tReward -20.0 \tAverage Score: -19.97 \tEpsilon: 0.5618938591163328target updating\n",
            "Episode 117\tReward -18.0 \tAverage Score: -19.91 \tEpsilon: 0.5562889678716474target updating\n",
            "Episode 120\tReward -18.0 \tAverage Score: -19.83 \tEpsilon: 0.547986285490042target updating\n",
            "Episode 124\tReward -19.0 \tAverage Score: -19.79 \tEpsilon: 0.5371084840724134target updating\n",
            "Episode 126\tReward -19.0 \tAverage Score: -19.71 \tEpsilon: 0.531750826943791target updating\n",
            "Episode 129\tReward -18.0 \tAverage Score: -19.68 \tEpsilon: 0.5238143793828016target updating\n",
            "Episode 132\tReward -17.0 \tAverage Score: -19.64 \tEpsilon: 0.5159963842937159target updating\n",
            "Episode 134\tReward -18.0 \tAverage Score: -19.58 \tEpsilon: 0.510849320360386target updating\n",
            "Episode 137\tReward -15.0 \tAverage Score: -19.49 \tEpsilon: 0.5032248303978422target updating\n",
            "Episode 139\tReward -21.0 \tAverage Score: -19.48 \tEpsilon: 0.4982051627146237target updating\n",
            "Episode 142\tReward -17.0 \tAverage Score: -19.45 \tEpsilon: 0.4907693883854626target updating\n",
            "Episode 145\tReward -18.0 \tAverage Score: -19.31 \tEpsilon: 0.483444593917636target updating\n",
            "Episode 147\tReward -19.0 \tAverage Score: -19.30 \tEpsilon: 0.47862223409330756target updating\n",
            "Episode 150\tReward -17.0 \tAverage Score: -19.22 \tEpsilon: 0.47147873742168567target updating\n",
            "Episode 152\tReward -21.0 \tAverage Score: -19.23 \tEpsilon: 0.46677573701590436target updating\n",
            "Episode 155\tReward -19.0 \tAverage Score: -19.22 \tEpsilon: 0.4598090507939749target updating\n",
            "Episode 157\tReward -20.0 \tAverage Score: -19.19 \tEpsilon: 0.45522245551230495target updating\n",
            "Episode 159\tReward -21.0 \tAverage Score: -19.20 \tEpsilon: 0.4506816115185697target updating\n",
            "Episode 162\tReward -20.0 \tAverage Score: -19.16 \tEpsilon: 0.4439551321314536target updating\n",
            "Episode 164\tReward -19.0 \tAverage Score: -19.11 \tEpsilon: 0.43952667968844233target updating\n",
            "Episode 167\tReward -18.0 \tAverage Score: -19.07 \tEpsilon: 0.43296668905325736target updating\n",
            "Episode 169\tReward -18.0 \tAverage Score: -19.05 \tEpsilon: 0.4286478463299511target updating\n",
            "Episode 171\tReward -17.0 \tAverage Score: -18.98 \tEpsilon: 0.42437208406280985target updating\n",
            "Episode 174\tReward -20.0 \tAverage Score: -18.98 \tEpsilon: 0.4180382776616619target updating\n",
            "Episode 176\tReward -17.0 \tAverage Score: -18.92 \tEpsilon: 0.41386834584198684target updating\n",
            "Episode 178\tReward -17.0 \tAverage Score: -18.84 \tEpsilon: 0.40974000909221303target updating\n",
            "Episode 181\tReward -17.0 \tAverage Score: -18.79 \tEpsilon: 0.4036245882390106target updating\n",
            "Episode 183\tReward -17.0 \tAverage Score: -18.75 \tEpsilon: 0.3995984329713264target updating\n",
            "Episode 185\tReward -18.0 \tAverage Score: -18.70 \tEpsilon: 0.39561243860243744target updating\n",
            "Episode 187\tReward -20.0 \tAverage Score: -18.69 \tEpsilon: 0.39166620452737816target updating\n",
            "Episode 190\tReward -20.0 \tAverage Score: -18.69 \tEpsilon: 0.3858205374665315target updating\n",
            "Episode 192\tReward -17.0 \tAverage Score: -18.66 \tEpsilon: 0.3819719776053028target updating\n",
            "Episode 194\tReward -16.0 \tAverage Score: -18.58 \tEpsilon: 0.37816180712868996target updating\n",
            "Episode 196\tReward -16.0 \tAverage Score: -18.50 \tEpsilon: 0.3743896431025813target updating\n",
            "Episode 198\tReward -15.0 \tAverage Score: -18.44 \tEpsilon: 0.3706551064126331target updating\n",
            "Episode 200\tAverage Score: -18.39 \tEpsilon: 0.3669578217261671\n",
            "target updating\n",
            "Episode 202\tReward -20.0 \tAverage Score: -18.34 \tEpsilon: 0.3632974174544486target updating\n",
            "Episode 204\tReward -16.0 \tAverage Score: -18.27 \tEpsilon: 0.3596735257153405target updating\n",
            "Episode 205\tReward -14.0 \tAverage Score: -18.22 \tEpsilon: 0.3578751580867638target updating\n",
            "Episode 208\tReward -19.0 \tAverage Score: -18.21 \tEpsilon: 0.35253382661792404target updating\n",
            "Episode 210\tReward -19.0 \tAverage Score: -18.23 \tEpsilon: 0.34901730169741024target updating\n",
            "Episode 212\tReward -19.0 \tAverage Score: -18.25 \tEpsilon: 0.3455358541129786target updating\n",
            "Episode 214\tReward -16.0 \tAverage Score: -18.14 \tEpsilon: 0.3420891339682016target updating\n",
            "Episode 216\tReward -19.0 \tAverage Score: -18.13 \tEpsilon: 0.3386767948568688target updating\n",
            "Episode 218\tReward -16.0 \tAverage Score: -18.10 \tEpsilon: 0.3352984938281715target updating\n",
            "Episode 220\tReward -20.0 \tAverage Score: -18.12 \tEpsilon: 0.33195389135223546target updating\n",
            "Episode 222\tReward -14.0 \tAverage Score: -18.06 \tEpsilon: 0.32864265128599696target updating\n",
            "Episode 224\tReward -17.0 \tAverage Score: -18.03 \tEpsilon: 0.3253644408394192target updating\n",
            "Episode 225\tReward -15.0 \tAverage Score: -18.03 \tEpsilon: 0.3237376186352221target updating\n",
            "Episode 227\tReward -20.0 \tAverage Score: -18.00 \tEpsilon: 0.32050833588933575target updating\n",
            "Episode 229\tReward -16.0 \tAverage Score: -17.95 \tEpsilon: 0.3173112652388396target updating\n",
            "Episode 231\tReward -17.0 \tAverage Score: -17.85 \tEpsilon: 0.3141460853680822target updating\n",
            "Episode 233\tReward -16.0 \tAverage Score: -17.83 \tEpsilon: 0.31101247816653554target updating\n",
            "Episode 235\tReward -18.0 \tAverage Score: -17.79 \tEpsilon: 0.3079101286968243target updating\n",
            "Episode 236\tReward -17.0 \tAverage Score: -17.77 \tEpsilon: 0.3063705780533402target updating\n",
            "Episode 238\tReward -14.0 \tAverage Score: -17.70 \tEpsilon: 0.3033145315372582target updating\n",
            "Episode 240\tReward -18.0 \tAverage Score: -17.62 \tEpsilon: 0.30028896908517405target updating\n",
            "Episode 242\tReward -16.0 \tAverage Score: -17.60 \tEpsilon: 0.29729358661854943target updating\n",
            "Episode 244\tReward -20.0 \tAverage Score: -17.70 \tEpsilon: 0.2943280830920294target updating\n",
            "Episode 246\tReward -15.0 \tAverage Score: -17.68 \tEpsilon: 0.2913921604631864target updating\n",
            "Episode 248\tReward -17.0 \tAverage Score: -17.66 \tEpsilon: 0.2884855236625661target updating\n",
            "Episode 251\tReward -20.0 \tAverage Score: -17.71 \tEpsilon: 0.28417984116121187target updating\n",
            "Episode 252\tReward -18.0 \tAverage Score: -17.68 \tEpsilon: 0.2827589419554058target updating\n",
            "Episode 254\tReward -16.0 \tAverage Score: -17.61 \tEpsilon: 0.2799384215094006target updating\n",
            "Episode 257\tReward -17.0 \tAverage Score: -17.57 \tEpsilon: 0.2757603055760701target updating\n",
            "Episode 258\tReward -17.0 \tAverage Score: -17.54 \tEpsilon: 0.2743815040481898target updating\n",
            "Episode 260\tReward -20.0 \tAverage Score: -17.52 \tEpsilon: 0.27164454854530906target updating\n",
            "Episode 262\tReward -16.0 \tAverage Score: -17.49 \tEpsilon: 0.2689348941735696target updating\n",
            "Episode 263\tReward -12.0 \tAverage Score: -17.45 \tEpsilon: 0.26759021970270175target updating\n",
            "Episode 265\tReward -17.0 \tAverage Score: -17.37 \tEpsilon: 0.2649210072611673target updating\n",
            "Episode 267\tReward -18.0 \tAverage Score: -17.35 \tEpsilon: 0.26227842021373715target updating\n",
            "Episode 268\tReward -16.0 \tAverage Score: -17.33 \tEpsilon: 0.2609670281126685target updating\n",
            "Episode 270\tReward -16.0 \tAverage Score: -17.30 \tEpsilon: 0.2583638820072446target updating\n",
            "Episode 272\tReward -18.0 \tAverage Score: -17.27 \tEpsilon: 0.25578670228422234target updating\n",
            "Episode 274\tReward -15.0 \tAverage Score: -17.18 \tEpsilon: 0.2532352299289372target updating\n",
            "Episode 276\tReward -18.0 \tAverage Score: -17.16 \tEpsilon: 0.2507092085103961target updating\n",
            "Episode 278\tReward -19.0 \tAverage Score: -17.20 \tEpsilon: 0.24820838415550486target updating\n",
            "Episode 280\tReward -14.0 \tAverage Score: -17.12 \tEpsilon: 0.2457325055235537target updating\n",
            "Episode 281\tReward -15.0 \tAverage Score: -17.10 \tEpsilon: 0.24450384299593592target updating\n",
            "Episode 283\tReward -21.0 \tAverage Score: -17.03 \tEpsilon: 0.24206491716205145target updating\n",
            "Episode 285\tReward -17.0 \tAverage Score: -16.98 \tEpsilon: 0.23965031961336target updating\n",
            "Episode 287\tReward -17.0 \tAverage Score: -16.92 \tEpsilon: 0.23725980767521673target updating\n",
            "Episode 289\tReward -18.0 \tAverage Score: -16.81 \tEpsilon: 0.23489314109365644target updating\n",
            "Episode 290\tReward -12.0 \tAverage Score: -16.73 \tEpsilon: 0.23371867538818816target updating\n",
            "Episode 292\tReward -16.0 \tAverage Score: -16.69 \tEpsilon: 0.231387331601191target updating\n",
            "Episode 294\tReward -15.0 \tAverage Score: -16.71 \tEpsilon: 0.2290792429684691target updating\n",
            "Episode 296\tReward -16.0 \tAverage Score: -16.72 \tEpsilon: 0.22679417751985861target updating\n",
            "Episode 298\tReward -15.0 \tAverage Score: -16.66 \tEpsilon: 0.22453190559909803target updating\n",
            "Episode 299\tReward -14.0 \tAverage Score: -16.64 \tEpsilon: 0.22340924607110255target updating\n",
            "Episode 300\tAverage Score: -16.61 \tEpsilon: 0.22229219984074702\n",
            "Episode 301\tReward -17.0 \tAverage Score: -16.63 \tEpsilon: 0.2211807388415433target updating\n",
            "Episode 302\tReward -17.0 \tAverage Score: -16.60 \tEpsilon: 0.22007483514733558target updating\n",
            "Episode 304\tReward -13.0 \tAverage Score: -16.59 \tEpsilon: 0.2178795886667409target updating\n",
            "Episode 305\tReward -14.0 \tAverage Score: -16.59 \tEpsilon: 0.2167901907234072target updating\n",
            "Episode 307\tReward -19.0 \tAverage Score: -16.57 \tEpsilon: 0.21462770857094118target updating\n",
            "Episode 309\tReward -18.0 \tAverage Score: -16.54 \tEpsilon: 0.21248679717794605target updating\n",
            "Episode 310\tReward -15.0 \tAverage Score: -16.50 \tEpsilon: 0.21142436319205632target updating\n",
            "Episode 312\tReward -14.0 \tAverage Score: -16.44 \tEpsilon: 0.20931540516921554target updating\n",
            "Episode 313\tReward -18.0 \tAverage Score: -16.49 \tEpsilon: 0.20826882814336947target updating\n",
            "Episode 314\tReward -19.0 \tAverage Score: -16.52 \tEpsilon: 0.20722748400265262target updating\n",
            "Episode 316\tReward -17.0 \tAverage Score: -16.45 \tEpsilon: 0.20516038984972615target updating\n",
            "Episode 317\tReward -14.0 \tAverage Score: -16.41 \tEpsilon: 0.2041345879004775target updating\n",
            "Episode 319\tReward -16.0 \tAverage Score: -16.42 \tEpsilon: 0.20209834538617025target updating\n",
            "Episode 321\tReward -12.0 \tAverage Score: -16.33 \tEpsilon: 0.2000824143909432target updating\n",
            "Episode 322\tReward -16.0 \tAverage Score: -16.35 \tEpsilon: 0.19908200231898848target updating\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "BCNoMUGyrwrR"
      },
      "source": [
        ""
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}